---
title: "Regressions"
author: "Zachary Steinert-Threlkeld"
date: "06.19.2018"
output:
  pdf_document:
  number_sections: yes
toc: yes
toc_depth: 4
---
<!-- 
Makes the regressions, robustness checks, and some summary statistics.
-->

# Setup
```{r setup, include=TRUE}
#knitr::opts_chunk$set(echo = TRUE)
#knitr::opts_knit$set(root.dir ='<path/to/Replication/>')
#library(knitr)
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=80),tidy=TRUE)
library(plyr)
library(stargazer)
library(dplyr) # For aggregation
library(tidyr)  # For complete function
library(caret)
library(xtable)
library(car)  # For vif
library(stats)  # For pacf
library(effects) # For marginal effects
library(MASS)  # For glm.nb
library(mfx)  # For count model robustness checks
library(pscl)  # For zeroinfl() model
library(RCurl) # For getURL
library(plm)
library(olsrr)
library(vars)  # for vector autoregression
```


```{r}
#setwd('<path/to/Replication/>')
```
# Functions
## Main Models, Cross Validation
```{r, cache=TRUE}

makeModels_crossValidate <- function(data, DV, DV_lag, method, cluster=c('city_use')){
  train_control <- trainControl(method='cv', number=5)
  
  data$DV = data[[DV]]  # Take the user passed string
  data$DV_lag = data[[DV_lag]]

  violence <- DV ~  protest_result.protester_violence_lag  + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag + tweets_lag + DV_lag + factor(city_use)

  demographics <- DV ~ facesMale_Perc_lag +  youngAdultPerc_lag + tweets_lag + DV_lag + factor(city_use)

  combined <- DV ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag + facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + DV_lag + factor(city_use)
  
  m_violence <- train(violence, data=data, trControl=train_control, method=method, na.action=na.omit, tuneLength=2)
  m_demographics <- train(demographics, data=data, trControl=train_control, method=method, na.action=na.omit, tuneLength=2)
  m_combined <- train(combined, data=data, trControl=train_control, method=method, na.action=na.omit, tuneLength=2)
  
  
  # Make cluster SEs.
  # The summary function requires a named DV and for some reason does not work if call summary on an object that is lm of a formula object, has to be like below.
  lm_violence <- plm(DV ~  protest_result.protester_violence_lag  + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag + tweets_lag + DV_lag, data=data, index=c('city_use'), model=c('within'))
  lm_demographics <- plm(DV ~ facesMale_Perc_lag +  youngAdultPerc_lag + tweets_lag + DV_lag, data=data, index=c('city_use'), model=c('within')) 
  lm_combined <- plm(DV ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + DV_lag, data=data, index=c('city_use'), model=c('within'))
  
  # Below is better than the commented out because I can see which coefficient it corresponds to.  as.vector prints better but still misses one, so stargazer doesn't align things quite right.
  m_violence_clusterSE <- coeftest(lm_violence, vcov=vcovHC(lm_violence, cluster='group'))[,2]
  m_demographics_clusterSE <- coeftest(lm_demographics, vcov=vcovHC(lm_demographics, cluster='group'))[,2]
  m_combined_clusterSE <-  coeftest(lm_combined, vcov=vcovHC(lm_combined, cluster='group'))[,2]
  
  return(list(m_violence=m_violence, m_demographics=m_demographics, m_combined=m_combined, m_violence_clusterSE=m_violence_clusterSE, m_demographics_clusterSE=m_demographics_clusterSE, m_combined_clusterSE=m_combined_clusterSE))
}

```


## Main Models, Weighted
NB: Below does not use clustered SEs because the modified summary function does not work with train() models and plm does not work with weights (even though weight is an argument). 
```{r}
makeModels_crossValidate_weighted <- function(data, DV, DV_lag, method, cluster=c('city_use'), weight=data$tweets_lag){
  train_control <- trainControl(method='cv', number=5)
  
  data$DV = data[[DV]]  # Take the user passed string
  data$DV_lag = data[[DV_lag]]

  violence <- DV ~  protest_result.protester_violence_lag  + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag + tweets_lag + DV_lag + factor(city_use)

  demographics <- DV ~ facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + DV_lag + factor(city_use)

  combined <- DV ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag + facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + DV_lag + factor(city_use)

  m_violence <- train(violence, data=data, trControl=train_control, method=method, na.action=na.omit, tuneLength=2, weights=weight)
  m_demographics <- train(demographics, data=data, trControl=train_control, method=method, na.action=na.omit, tuneLength=2, weights=weight)
  m_combined <- train(combined, data=data, trControl=train_control, method=method, na.action=na.omit, tuneLength=2, weights=weight)
  
  m_violence_clusterSE <- summary(m_violence)$coefficients[,2]
  m_demographics_clusterSE <- summary(m_demographics)$coefficients[,2]
  m_combined_clusterSE <-  summary(m_combined)$coefficients[,2]
  
  return(list(m_violence=m_violence, m_demographics=m_demographics, m_combined=m_combined, m_violence_clusterSE=m_violence_clusterSE, m_demographics_clusterSE=m_demographics_clusterSE, m_combined_clusterSE=m_combined$clusterSE))
}
```




## Make Short
```{r}
makeShortData <- function(data, thesecities){
  # Subset cities
  short <- subset(data, city_use %in% thesecities)
  
  # Get rid of bad observations
  short <- short[short$country != 'EG',]  # Had wrong dates for Egypt
  short <- short[short$city_use != 'Guayabal',]  # Country level for Venezuela
  short <- short[short$city_use != 'Puerto Carreno',]  # Country level for Venezuela as well
  short <- short[short$city_use != 'Baykit',]  # Russia country  level
  short <- short[short$city_use != 'Dobrovelychkivka',]  # Ukraine country level
  short <- short[short$city_use != 'Pir Mahal',]  # Pakistan country level
  short <- short[short$city_use != 'Asilah',]  # A town in Morocco, not sure how it got here
  
  short <- short[short$country != 'RU',]  # Cross-section, not temporal
  short <- short[short$city_use != 'Smilavichy',]  # Centroid for Belarus, so country level.  

  return(short) 
}
```


## Drop Certain Days
```{r}
dropDays <- function(data, cc, remove){
  temp <- data[data$country == cc,]
  
  days <- sort(unique(temp$day))
  
  start <- days[1:remove]
  end <- days[(length(days)-remove):length(days)]
  
  keep <- grep(paste(c(start, end), collapse='|'), temp$day, invert=TRUE)
  
  temp <- temp[keep,]
  
  return(temp)
}

dropDays_parent <- function(data){
  toUse <- unique(data$country)

  blah <- NULL
  for(i in 1:length(toUse)){
    temp <- dropDays(data=data, cc=toUse[i], remove=7)
    blah <- rbind(blah, temp)
  }

  return(blah) 
}
```


## Add Some Demographics
```{r}
# Add .00001 so do not get NAs
addDemo <- function(data){
  data$youngAdultPerc <- data$faces20_29_Sum/(data$totalFaces_Sum+.00001)
  data$youngAdultPerc_lag <- data$faces20_29_Sum_lag/(data$totalFaces_Sum_lag+.00001)

  data$facesMale_Perc <- data$facesMale_Sum/(data$totalFaces_Sum+.001)
  data$facesMale_Perc_lag <- data$facesMale_Sum_lag/(data$totalFaces_Sum_lag+.00001)
  
  return(data)
}
```

# Open Data
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
country <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays.csv', stringsAsFactors=FALSE)
country$country <- country$place.country_code
country <- addDemo(data=country)
```

## Update column names
I had put "binary_" in front of a lot, not sure what I was thinking.
```{r}
names(country) <- gsub('binary_', '', names(country))
```

## Add Day Information to Tweets
```{r}
country$dayOfWeek <- weekdays(as.Date(country$day), abbreviate=FALSE)
country$weekend <- ifelse(grepl(paste('Saturday', 'Sunday', sep='|'), country$dayOfWeek)==TRUE, 1, 0)
```

## Drop Days
This subsection drops days from the start and end of a protest.  When the data were pulled from Twitter, two weeks before and after protests' start was chosen.  This trims.
```{r}
country <- dropDays_parent(data=country)
```

## Add Number of Protest Days Control
```{r}
country <- data.frame(country %>% group_by(city_use) %>% mutate(protest_duration = row_number()))
```

## Add Number of Repression Days Control
See here for the code I used: https://stackoverflow.com/questions/19998836/count-consecutive-occurrences-of-values-in-a-single-column.
```{r}
# x is vector
consecutiveDays <- function(x){
  x[x > 1] <- 1  # I only care that state violence photos were shared, not how many.
  counter <- sequence(rle(as.character(x))$lengths)  # Counts number of occurrences in a row
  temp_df <- data.frame(temp=x, counter=counter)  # Make df so I can replace in a row occurrences of no state violence
  temp_df$counter[temp_df$temp==0] <- 0 # Do not care about runs of no state violence

  return(temp_df$counter)
}

country <- data.frame(country %>% group_by(city_use) %>% mutate(state_violence_duration = consecutiveDays(state_violence)))
```

## Identify Cities
Use this object to filter countries wanted.  This is based on observing how many tweets per day they have from `ProcesssFromPythonAggregation` script.

Which cities to use?  5,109 have a day with a tweet.
```{r}
citydays <- country %>% group_by(country, city_use) %>% summarize(daysWithTweets=length(tweets[tweets>0]))

countrydays <- country %>% group_by(country) %>% summarize(days=length(unique(day)))

# Add number of days of each protest period
citydays <- merge(citydays, countrydays, by.x=c('country'), by.y=c('country'))

# Get rate
citydays$rate <- citydays$daysWithTweets/citydays$days
citydays <- citydays[order(citydays$rate, decreasing=TRUE),]

# Subset
#smaller <- citydays[citydays$country %in% thesecountries,]

percThreshold <- 1/7  # For one day per week
thesecities <- citydays$city_use[citydays$rate>=percThreshold]
```

# Make Subset
I save this subset out to make some later data loading easier.
```{r}
short <- makeShortData(data=country, thesecities=thesecities)
short2 <- makeShortData(data=country, thesecities=thesecities)

write.csv(short, 'Data/02_processedData/f_eParsedForMainRegressions.csv', row.names=FALSE)
```

# How Many Tweets
```{r}
sum(short$tweets)
nrow(short[short$tweets>0,])
```

## Look at cities, countries
```{r}
table(short$city_use, short$country)
```

# Figure A14
This section investigates how colinear any variables in the models are.  If any are, I will then remove them from the models.
```{r}
these <- c(#'n_face', 
           'totalFaces_log', 
           #'protest_result.violence_lag', 
           'protest_result.protester_violence_lag', 
           'protest_result.state_violence_lag', 
           'police_lag', 
           'fire_lag', 
           #'groupAny_lag',
           #'group_20_lag',
           #'group_100_lag',
           #'entropyGender_Mean_lag', 
           #'entropyRace_Mean_lag', 
           #'children_lag',
           #'entropyRace_byDay_lag',
           'facesMale_Perc_lag',
           #'entropyAge_byDay_lag'
           'youngAdultPerc_lag',
           # 'tweets_lag', 
           # 'n_face_lag', 
           'n_face_lag_log'
           # 'violence_lag', 
           # 'protester_violence_lag', 
           # 'state_violence_lag'
           )

hm <- cor(short[these], use='complete.obs')  # These defined earlier
hm[,findCorrelation(hm, cutoff=.5)]  # Great, only group_20 and group_100.  
```
Here is a heatmap, kind of. 

```{r}
covarlabels <- c(#'Protest Size$_{i,t}$', 
                 'Log(Protest Size)$_{i,t}$', 
                 #'Perceived Violence$_{i,t-1}$', 
                 'Perceived Prtstr Violence$_{i,t-1}$', 
                 'Perceived Stt Violence$_{i,t-1}$', 
                 'Police$_{i,t-1}$', 
                 'Fire$_{i,t-1}$', 
                 #'Any Group$_{i,t-1}$',
                 #'Small Group$_{i,t-1}$',
                 #'Large Group$_{i,t-1}$',
                 'Percent Male$_{i,t-1}$', 
                 #'Race Diversity$_{i,t-1}$',  
                 'Percent Young Adult$_{i,t-1}$',  
                 #'Any Child$_{i,t-1}$'
                 #'Tweets$_{i,t-1}$', 
                 #'Faces Total$_{i,t-1}$', 
                 'Log(Protest Size)$_{i,t-1}$'
                 #'Violence$_{i,t-1}$', 
                 #'Prtstr Violence$_{i,t-1}$',
                 #'Stt Violence$_{i,t-1}$'
                 )

covar_labels_fig <- list(
	#expression(Protest~Size[paste(i,',','t')]),
	expression(Log(Protest~Size[paste(i,',','t')])),
	#expression(Perc.~Violence[paste(i,',','t-1')]),
  expression(Perc.~Prtstr.~Violence[paste(i,',','t-1')]),
  expression(Perc.~Stt.~Violence[paste(i,',','t-1')]),
	expression(Police[paste(i,',','t-1')]),
	expression(Fire[paste(i,',','t-1')]),
	#expression(Any~Grp.[paste(i,',','t-1')]),
	#expression(Small~Grp.[paste(i,',','t-1')]),
	#expression(Faces~Avg.[paste(i,',','t-1')]),
	#expression(Large~Grp.[paste(i,',','t-1')]),
	expression(Perc.~Male[paste(i,',','t-1')]),
	#expression(Race~Div.[paste(i,',','t-1')]),
	expression(Perc~Yng.~Adlt.[paste(i,',','t-1')]),
	#expression(Any~Child[paste(i,',','t-1')])
	#expression(paste(PV,'*',Police)[paste(i,',','t-1')]),
	# expression(Tweets[paste(i,',','t')]),
	# expression(Protest~Size[paste(i,',','t-1')]),
	 expression(Log(Protest~Size[paste(i,',','t-1')]))
	# expression(Violence[paste(i,',','t-1')]),
	# expression(Prtstr~Violence[paste(i,',','t-1')]),
	# expression(Stt.~Violence[paste(i,',','t-1')])
	)

covar_labels_fig_nolist <- NULL
for(i in 1:length(covar_labels_fig)){
  covar_labels_fig_nolist[i] <- covar_labels_fig[[i]]
}

newx <-rep(1:nrow(hm), nrow(hm))
newy <- rep(1:nrow(hm), each=nrow(hm))
jpeg('./Figures/variablesheatmapnumbers_DonghyeonAlexmerged.jpeg', width=8, height=8, units='in', res=500)
par(mai=c(2,2,1,1))
plot(x= newx, y=newy, pch='', xlab='', ylab='', xaxt='n', yaxt='n')
text(x=rep(1:nrow(hm), nrow(hm)), y=rep(1:nrow(hm), each=nrow(hm)), labels=as.matrix(round(hm,2)), cex=1)
axis(side=1, at=1:nrow(hm), labels=covar_labels_fig_nolist, las=2, cex.axis=1)
axis(side=2, at=1:nrow(hm), labels=covar_labels_fig_nolist, las=1, cex.axis=1)
dev.off()
```

# Descriptive Statistics
## Table 5
The below is the summary statistic table for the paper, Table 5.  
```{r}
these <- c('n_face', 
           #'protest_result.violence_lag',
           'protest_result.protester_violence_lag',
           'protest_result.state_violence_lag',
           'police_lag', 
           'fire_lag', 
           #'totalFaces_log', 
           'children_lag', 
           'facesMale_Perc_lag',
           'youngAdultPerc_lag',
           'entropyRace_byDay_lag', 
           'entropyAge_byDay_lag',
           #'violence',
           'protester_violence',
           'state_violence',
           #'ViolencePolice_lag', 
           #'ViolencePoliceFire_lag', 
           #'photo_lag',
           #'group_20_lag',  
           #'group_100_lag' ,
           #'groupAny_lag', 
           'tweets_lag' 
           #'n_face_lag'
           #'n_face_lag_log')
)

tosummarize <- short[short$city_use %in% thesecities,]
tosummarize <- tosummarize[,these]

these_labels <- c('Protest Size', 
           #'totalFaces_log', 
           #'Perceived Violece',
           'Perceived Protester Violence',
           'Perceived State Violence',
           'Photos w/ Police', 
           'Photos w/ Fire', 
           #'ViolencePolice_lag', 
           #'ViolencePoliceFire_lag', 
           #'photo_lag',
           'Photos w/ Children', 
           'Percent Male', 
           'Percent 20-29', 
           'Race Diversity',
           'Age Diversity',
           #'Photos w/ Violence',
           'Photos w/ Prtstr Violence',
           'Photos w/ Stt Violence',
           #'Photos w/ Small Group',  
           #'Photos w/ Large Group' ,
           #'Photos w/ Any Group', 
           'Photos per day'
           #'n_face_lag'
           #'n_face_lag_log')
)

stargazer(tosummarize, covariate.labels = these_labels, out='./Tables/Table5.tex', digits=2, omit.summary.stat = c("p25", "p75"))
```

## Exploration
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
# Countrydays
table(short$country)

# Tweets per day
temp <- data.frame(short %>% group_by(city_use) %>% summarize(tweets=sum(tweets), days=length(day)))
temp$tweetsDay <- temp$tweets/temp$days
temp
```

Tweest per day, when have tweets.
```{r}
# Tweets per day
this <- short[short$tweets > 0,]
temp <- data.frame(this %>% group_by(city_use) %>% summarize(tweets=sum(tweets), days=length(day)))
temp$tweetsDay <- temp$tweets/temp$days
temp
```

## How many zero days?
```{r}
hm <- short %>% group_by(city_use) %>% summarize(tweets=sum(tweets==0))
xtable(hm)
```

## How many tweets?
Also, how many days with tweets.
```{r}
sum(short$tweets)
nrow(short[short$tweets>0,])
```

# Main Model
### All-countries
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
models <- makeModels_crossValidate(data=short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```

### Table A5
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE, include=TRUE}
stargazer(models$m_violence$finalModel, models$m_demographics$finalModel, models$m_combined$finalModel, type='latex', out='./Tables/appendix_table_A5.tex', column.labels=c('Violence', 'Demographics', 'Combined'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(models$m_violence_clusterSE,models$m_demographics_clusterSE, models$m_combined_clusterSE))

```

### Main model diagnostics
#### Heteroskedasticity
Is there heteroskedasticity?
```{r}
temp <- lm(totalFaces_log ~  protest_result.protester_violence_lag + protest_result.state_violence_lag + 
    I(protest_result.state_violence_lag * protest_result.state_violence_lag) + 
    police_lag + fire_lag + group_100_lag + I(group_100_lag * 
    group_100_lag) + entropyGender_Mean_lag + entropyRace_Mean_lag + 
    children_lag + tweets_lag +totalFaces_lag_log + I(n_face_lag_log*n_face_lag_log), data=short)

ols_test_breusch_pagan(temp)
```
Yes.  

#### Variance inflation factor
Does a variance inflation factor suggest too much collinearity?

For the main model.
```{r}
# Use main model
temp <- lm(totalFaces_log ~  protest_result.protester_violence_lag + protest_result.state_violence_lag + 
    I(protest_result.state_violence_lag * protest_result.state_violence_lag) + 
    police_lag + fire_lag  + facesMale_Perc_lag +  youngAdultPerc_lag + tweets_lag +totalFaces_lag_log, data=short)
vif(temp)
```
Result: good VIF, all except for state violence and its interaction.


#### Autocorrelation function
```{r}
pdf('./Figures/pacf.pdf')
pacf(short$totalFaces_log, main='Partial Autocorrelation for Log(Faces)')
dev.off()
```

Result: 15 lags.

#### Regression F test
To see if should include squared terms.
```{r}
nested <-lm(totalFaces_log ~  protest_result.protester_violence_lag + protest_result.state_violence_lag +  police_lag + fire_lag  + facesMale_Perc_lag +  youngAdultPerc_lag + tweets_lag + totalFaces_lag_log, data=short)

full <- lm(totalFaces_log ~  protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag * protest_result.state_violence_lag) +  police_lag + fire_lag  + facesMale_Perc_lag +  youngAdultPerc_lag + tweets_lag + totalFaces_lag_log, data=short)

rss_nested <- sum(resid(nested)^2)
rss_full <- sum(resid(full)^2)
params_nested <- length(nested$coefficients) - 1 # -1 to not count intercept
params_full <- length(full$coefficients) - 1 
n <- nrow(country)

f <- ((rss_nested-rss_full)/(params_full-params_nested))/(rss_full/(n-params_full))  # degrees of freedom is (params_full-params_nested, n=params_full) --> (2, 1948)

f
```
f is 410.5181, definitely big enough

#### Stationarity?
Dickey-Fuller test.
```{r}
library(tseries)
adf.test(short$totalFaces_log)
```

p-value  is .01, so reject H_0 of non-stationary.  Data are stationary.

#### Lagrange Multiplier
```{r}
library(plm)

combined <- totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag +totalFaces_lag_log+ factor(city_use)
  
short <- makeShortData(data=country, thesecities=thesecities)

data_new <- short[all.vars(combined)]
data_new[is.na(data_new)] <- 0

plm_ols <- plm(combined,data=data_new,model='within',index=c('city_use'))

# Lagrange-Multiplier Test
plmtest(plm_ols, type = "bp",effect="twoways") #chisq = 16.08, p ~ .0003, "alternative hypothesis: significant effects"

plmtest(plm_ols, type = "honda", effect = "twoways") # normal=.39757, p=.3455, H_A is significant effects

plmtest(plm_ols, type = "kw", effect = "twoways") #"normal" = -1.7933, p=..9635, alt hypotheis is sig. effects
```

Breusch/Pagan test says is good to go.

### Marginal effects
```{r}
short <- makeShortData(data=country, thesecities=thesecities)
ymax <- 3
ymax_log <- .5
```


#### Figure 3a
```{r}
city_use2 <- factor(short$city_use)  # Effect function not work with factor calls

full <- totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag + facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log + city_use2

effect1 <- lm(full, data=short)
eff_1 <- Effect(c('protest_result.state_violence_lag'),mod=effect1,data=short,xlevels=list(protest_result.state_violence_lag=seq(0,1,by=.01)))
eff_1 <- data.frame(eff_1)

# Make percentage change
eff_1$percentChange <- (eff_1-eff_1$fit[1])/eff_1$fit[1]
#ymin <- min(eff_1$percentChange$fit, na.rm=TRUE)
ymin <- -1
ymax <- max(eff_1$percentChange$fit, na.rm=TRUE)

pdf('./Figures/fg3a.pdf')
par(mar=c(5,6,4,2), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5)
plot(eff_1$protest_result.state_violence_lag, y=eff_1$percentChange$fit*100, type='l', xlab=expression(State~Violence[paste(i,',','t-1')]), ylab=expression(Percent~Change~of~Protest~Size[paste(i,',','t')]), ylim=c(ymin*100,175), xlim=c(0,1))
rug(short$protest_result.state_violence_lag)
lines(x=eff_1$protest_result.state_violence_lag, y=eff_1$percentChange$lower*100, lty='dotted')
lines(x=eff_1$protest_result.state_violence_lag, y=eff_1$percentChange$upper*100, lty='dotted')
dev.off()
#ymin <- min(eff_1$percentChange$fit, na.rm=TRUE)
ymin <- -1
ymax <- max(eff_1$percentChange$fit, na.rm=TRUE)
```


#### Figure 3b
```{r}
ymax <- 3
city_use2 <- factor(short$city_use)  # Effect function not work with factor calls

full <- totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag + facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag +totalFaces_lag_log+ city_use2

effect1 <- lm(full, data=short)
eff_1 <- Effect(c('protest_result.protester_violence_lag'),mod=effect1,data=short,xlevels=list(protest_result.protester_violence_lag=seq(0,1,by=.01)))
eff_1 <- data.frame(eff_1)

# Percentage change
eff_1$percentChange <- (eff_1-eff_1$fit[1])/eff_1$fit[1]
ymin <- -1
ymax <- max(eff_1$percentChange$fit, na.rum=TRUE)

pdf('./Figures/fg3b.pdf')
par(mar=c(5,6,4,2), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5)
plot(eff_1$protest_result.protester_violence_lag, y=eff_1$percentChange$fit*100, type='l', xlab=expression(Protester~Violence[paste(i,',','t-1')]), ylab=expression(Percent~Change~of~Protest~Size[paste(i,',','t')]), ylim=c(ymin*100,175), xlim=c(0,1))
rug(short$protest_result.protester_violence_lag)
lines(x=eff_1$protest_result.protester_violence_lag, y=eff_1$percentChange$lower*100, lty='dotted')
lines(x=eff_1$protest_result.protester_violence_lag, y=eff_1$percentChange$upper*100, lty='dotted')
dev.off()

```


### Table A12: Models, One Country at a Time
```{r}
short <- makeShortData(data=country, thesecities=thesecities)

countries <- unique(short$country)

## Get countries
es <- short[short$country=='ES',]
hk <- short[short$country=='HK',]
kr <- short[short$country=='KR',]
pk <- short[short$country=='PK',]
ve1 <- short[short$country=='VE1',]
ve2 <- short[short$country=='VE2',]

## For PK, will need special model since few observations and only one city.
smallmodel <- totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + fire_lag + facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log 

## Make models
es_model <- makeModels_crossValidate(data=es, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
hk_model <- makeModels_crossValidate(data=hk, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
kr_model <- makeModels_crossValidate(data=kr, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
pk_model <- lm(smallmodel, data=pk)
ve1_model <- makeModels_crossValidate(data=ve1, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
ve2_model <- makeModels_crossValidate(data=ve2, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')


stargazer(es_model$m_combined$finalModel, hk_model$m_combined$finalModel, kr_model$m_combined$finalModel, ve1_model$m_combined$finalModel, ve2_model$m_combined$finalModel, type='latex', out='./Tables/appendix_table_a12.tex', column.labels=c('Spain', 'Hong Kong', 'South Korea', 'Venezuela, 2014-2015', 'Venezuela 2017'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr')
```

# Robustness, Log Tweets Control Variable
This check was requested by a reviewer.  Results do not change and the paper does not mention.
Copy the data frame, change tweets_lag to log.
```{r}
new_short <- short
new_short$tweets_lag <- log(short$tweets_lag+1, 10)

models <- makeModels_crossValidate(data=new_short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')

stargazer(models$m_combined$finalModel, type='latex', out='./Tables/robusness_logtweets.tex', column.labels=c( 'Combined'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr')
```


# Violence, Faces Correlation
## Figure 4
This section uses data at the photo level.  Since it is at the photo level, I cannot provide the raw data in order to protect anonymity.
```{r}
temp <- read.csv('./Data/02_processedData/c2_DonghyeonAlexmerged_classifiers_shortSpain.csv', stringsAsFactors=FALSE)

temp2 <- temp[temp$city_use %in% short$city_use,]

ggplot(temp2, aes(x=protest_result.state_violence, y=faces)) + geom_point() + geom_smooth(method = "loess", se = FALSE, span=.2) + labs(x=expression(State~Violence[paste(i,',','t')]), y=expression(Faces~per~Photo[paste(i,',','t')])) + theme_classic() + theme(axis.text.x = element_text( size=14), strip.text=element_text(size=14), axis.text.y = element_text(size=14), axis.title.y=element_text(size=14), axis.title.x=element_text(size=14)) + ylim(0,25)
ggsave('./Figures/fg4.jpg')
```

# Robustness, State Violence Non-Parametric
## Figure 5a
```{r}
# add .01 so do not divide by 0
# perc2 is how i had it in the original graph in paper
short <- data.frame(short %>% group_by(city_use) %>% mutate(totalFaces_log_perc = ((totalFaces_log+1)-(totalFaces_lag_log+1))/(totalFaces_lag_log+1), totalFaces_perc = ((totalFaces_Sum+1)-(totalFaces_Sum_lag+1))/(totalFaces_Sum_lag+1)))

ggplot(short, aes(protest_result.state_violence_lag, totalFaces_perc)) + geom_point() + geom_smooth(method = "loess", se = FALSE, span=1) + labs(x=expression(State~Violence[paste(i,',','t-1')]), y=expression(Percent~Change~of~Protest~Size[paste(i,',','t')])) + theme_classic() + theme(axis.text.x = element_text( size=14), strip.text=element_text(size=14), axis.text.y = element_text(size=14), axis.title.y=element_text(size=14), axis.title.x=element_text(size=14)) + ylim(0,50)
ggsave('./Figures/fg5a.jpg')
```


## Figure 5b
```{r}
# percent
ggplot(short, aes(protest_result.state_violence_lag, totalFaces_perc)) + geom_point() + geom_smooth(method = "lm", se = TRUE, formula = y ~ splines::bs(x, df=1)) + labs(x=expression(State~Violence[paste(i,',','t-1')]), y=expression(Percent~Change~of~Protest~Size[paste(i,',','t')])) + theme_classic() + theme(axis.text.x = element_text( size=14), strip.text=element_text(size=14), axis.text.y = element_text(size=14), axis.title.y=element_text(size=14), axis.title.x=element_text(size=14)) + ylim(0,50)
ggsave('./Figures/fg5b.jpg')
```


## Figure 5c
```{r}
short$protest_result.state_violence_lag_buckets10 <- cut(short$protest_result.state_violence_lag,
                        breaks=c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1),
                         right=TRUE, labels=FALSE)

city_use2 <- factor(short$city_use)  # Effect function not work with factor calls

full <- totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag_buckets10 + I(protest_result.state_violence_lag_buckets10*protest_result.state_violence_lag_buckets10) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log+ city_use2

effect1 <- lm(full, data=short)
eff_1 <- Effect(c('protest_result.state_violence_lag_buckets10'),mod=effect1,data=short,xlevels=list(protest_result.state_violence_lag_buckets10=seq(0,10,by=1)))
eff_1 <- data.frame(eff_1)

# Make percentage change
eff_1$percentChange <- (eff_1-eff_1$fit[1])/eff_1$fit[1]
ymin <- -1
ymax <- max(eff_1$percentChange$fit, na.rm=TRUE)

pdf('./Figures/fg5c.pdf')
plot(x=seq(0, 10, 1), y=eff_1$percentChange$fit*100, type='l', xlab=expression(State~Violence~Bucket[paste(i,',','t-1')]), ylab=expression(Percent~Change~of~Protest~Size[paste(i,',','t')]), ylim=c(ymin*100, 175))
lines(x=seq(0, 10, 1), y=eff_1$percentChange$lower*100, lty='dotted')
lines(x=seq(0, 10, 1), y=eff_1$percentChange$upper*100, lty='dotted')
dev.off()
```

## Figure 5d
Partial out the covariates, run loess on the residuals.
```{r}
m1 <- lm(totalFaces_log ~ protest_result.protester_violence_lag  + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log + factor(city_use), data=short)

# Do this to get residuals to match
ugh <- short[names(m1$residuals),]

toplot <- data.frame(violence=ugh$protest_result.state_violence_lag, residual = m1$residuals)

ggplot(toplot, aes(violence, residual)) + geom_point() + geom_smooth(method = "loess", se = FALSE) + labs(x=expression(State~Violence[paste(i,',','t-1')]), y=expression(Partial~Residual[paste(i,',','t')])) + theme_classic() + theme(axis.text.x = element_text( size=14), strip.text=element_text(size=14), axis.text.y = element_text(size=14), axis.title.y=element_text(size=14), axis.title.x=element_text(size=14)) + ylim(-3,3)
ggsave('./Figures/fg5d.jpg')
```

# Figure A19 
## LOESS
```{r}
# add .01 so do not divide by 0
# perc2 is how i had it in the original graph in paper
short <- data.frame(short %>% group_by(city_use) %>% mutate(totalFaces_log_perc = ((totalFaces_log+1)-(totalFaces_lag_log+1))/(totalFaces_lag_log+1), totalFaces_perc = ((totalFaces_Sum+1)-(totalFaces_Sum_lag+1))/(totalFaces_Sum_lag+1)))

ggplot(short, aes(protest_result.protester_violence_lag, totalFaces_perc)) + geom_point() + geom_smooth(method = "loess", se = FALSE, span=.7) + labs(x=expression(Protester~Violence[paste(i,',','t-1')]), y=expression(Percent~Change~of~Protest~Size[paste(i,',','t')])) + theme_classic() + theme(axis.text.x = element_text( size=14), strip.text=element_text(size=14), axis.text.y = element_text(size=14), axis.title.y=element_text(size=14), axis.title.x=element_text(size=14)) + ylim(-5,50)
ggsave('./Figures/loess_protesterviolence_percentChange_DonghyeonAlexmerged_cityday.jpg')
```


## LOESS residuals
Partial out the covariates, run loess on the residuals.
```{r}
m1 <- lm(totalFaces_log ~ protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag)  + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log + factor(city_use), data=short)

# Do this to get residuals to match
ugh <- short[names(m1$residuals),]

toplot <- data.frame(violence=ugh$protest_result.protester_violence_lag, residual = m1$residuals)

ggplot(toplot, aes(violence, residual)) + geom_point() + geom_smooth(method = "loess", se = FALSE) + labs(x=expression(Protester~Violence[paste(i,',','t-1')]), y=expression(Partial~Residual[paste(i,',','t')])) + theme_classic() + theme(axis.text.x = element_text( size=14), strip.text=element_text(size=14), axis.text.y = element_text(size=14), axis.title.y=element_text(size=14), axis.title.x=element_text(size=14)) + ylim(-2.5,2.5)
ggsave('./Figures/residualsPartialRegression_protesterviolence_log_DonghyeonAlexmerged_cityday.jpg')
```

## Split Into Buckets
```{r}
short$protest_result.protester_violence_lag_buckets10 <- cut(short$protest_result.protester_violence_lag,
                        breaks=c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1),
                         right=TRUE, labels=FALSE)

city_use2 <- factor(short$city_use)  # Effect function not work with factor calls

full <- totalFaces_log ~ protest_result.protester_violence_lag_buckets10 + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log + city_use2

effect1 <- lm(full, data=short)
eff_1 <- Effect(c('protest_result.protester_violence_lag_buckets10'),mod=effect1,data=short,xlevels=list(protest_result.protester_violence_lag_buckets10=seq(0,10,by=1)))
eff_1 <- data.frame(eff_1)

# Make percentage change
eff_1$percentChange <- (eff_1-eff_1$fit[1])/eff_1$fit[1]
ymin <- -1
ymax <- max(eff_1$percentChange$fit, na.rm=TRUE)

pdf('./Figures/marginaleffects_protesterviolenceBuckets10_percentChange_DonghyeonAlexmerged_cityday.pdf')
plot(x=seq(0, 10, 1), y=eff_1$percentChange$fit*100, type='l', xlab=expression(Protester~Violence[paste(i,',','t-1')]), ylab=expression(Percent~Change~of~Protest~Size[paste(i,',','t')]), ylim=c(ymin*100, ymax*100), xlim=c(0,10))
lines(x=eff_1$protest_result.protester_violence_lag_buckets10, y=eff_1$percentChange$lower*100, lty='dotted')
lines(x=eff_1$protest_result.protester_violence_lag_buckets10, y=eff_1$percentChange$upper*100, lty='dotted')
dev.off()

```

## Spline 
```{r}
# percent
ggplot(short, aes(protest_result.protester_violence_lag, totalFaces_perc)) + geom_point() + geom_smooth(method = "lm", se = TRUE, formula = y ~ splines::bs(x, 100))+ labs(x=expression(Protester~Violence[paste(i,',','t-1')]), y=expression(Percent~Change~of~Protest~Size[paste(i,',','t')])) + theme_classic() + theme(axis.text.x = element_text( size=14), strip.text=element_text(size=14), axis.text.y = element_text(size=14), axis.title.y=element_text(size=14), axis.title.x=element_text(size=14)) + ylim(0,50)
ggsave('./Figures/spline_protesterviolence_percentChange_DonghyeonAlexmerged_cityday.jpg')
```

# Figure A18
This section is at the city-day level.
```{r}
ggplot(short, aes(protest_result.protester_violence, totalFaces_Mean)) + geom_point() + geom_smooth(method = "loess", se = FALSE, span=1) + labs(x=expression(Protester~Violence[paste(i,',','t')]), y=expression(Faces~per~Photo[paste(i,',','t')])) + theme_classic() + theme(axis.text.x = element_text( size=14), strip.text=element_text(size=14), axis.text.y = element_text(size=14), axis.title.y=element_text(size=14), axis.title.x=element_text(size=14)) + ylim(0,40)
ggsave('./Figures/correlation_loess_protesterviolence_facesPerPhoto_DonghyeonAlexmerged_cityday.jpg')
```

# Figure A20: Vector Autoregression
Followed here (https://www.r-econometrics.com/timeseries/varintro/) and here (https://cran.r-project.org/web/packages/vars/vignettes/vars.pdf).

## State violence
Below is for change in size.
```{r}
newdata <- short[,c('totalFaces_Sum','protest_result.protester_violence', 'protest_result.state_violence', 'police', 'fire', 'facesMale_Perc',   'youngAdultPerc', 'tweets')]
newdata$protest_result.state_violence2 <- newdata$protest_result.state_violence*newdata$protest_result.state_violence

lagselect <- VARselect(newdata, lag.max=50, type='const')  #AIC suggests 51, HQ suggests 6
plot(lagselect$criteria[1,], lty='dashed', type='l')
lines(x=1:50,y=lagselect$criteria[2,])

#### 41 lag, according AIC
m_var <- VAR(newdata, type='const', ic='AIC', lag.max=41)
# Calculate the IRF
ir.1 <- irf(m_var, impulse = "protest_result.state_violence", response = "totalFaces_Sum", n.ahead = 15, ortho = FALSE)
# Plot the IRF
jpeg(filename='./Figures/VAR_stateviolence_nolog_lag41_DonghyeonAlexmerged_classifiers_cityday.jpg', res=300, width=5, height=5, units="in")
plot(ir.1, main="", ylab=expression('%'~Change~of~Protest~Size[paste(i,',','t')]), mar=c(0,6,0,0))
dev.off()

# Calculate the IRF
ir.1 <- irf(m_var, impulse = "protest_result.state_violence2", response = "totalFaces_Sum", n.ahead = 15, ortho = FALSE)
# Plot the IRF
jpeg(filename='./Figures/VAR_stateviolencesquare_nolog_lag41_DonghyeonAlexmerged_classifiers_cityday.jpg', res=300, width=5, height=5, units="in")
plot(ir.1, ylab=expression('%'~Change~of~Protest~Size[paste(i,',','t')]), mar=c(0,6,0,0), main="")
dev.off()
```

## Protester violence
Below is for change in size.
```{r}
newdata <- short[,c('totalFaces_Sum','protest_result.protester_violence', 'protest_result.state_violence', 'police', 'fire', 'facesMale_Perc',   'youngAdultPerc', 'tweets')]
newdata$protest_result.state_violence2 <- newdata$protest_result.state_violence*newdata$protest_result.state_violence

m_var <- VAR(newdata, type='const', ic='AIC', lag.max=1)

# Calculate the IRF
ir.1 <- irf(m_var, impulse = "protest_result.protester_violence", response = "totalFaces_Sum", n.ahead = 15, ortho = FALSE)

# Plot the IRF
jpeg(filename='./Figures/VAR_protesterviolence_nolog_DonghyeonAlexmerged_classifiers_cityday.jpg', res=300, width=5, height=5, units="in")
plot(ir.1, main="", ylab=expression('%'~Change~of~Protest~Size[paste(i,',','t')]), mar=c(0,6,0,0))
dev.off()
```

# Table 6: Robustness, Strategic Behavior
## Dominant Language
```{r}
data <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_dominantLanguage.csv', stringsAsFactors=FALSE)
names(data) <- gsub('binary_', '', names(data))

data$country <- data$place.country_code  # Need to add country like I did earlier for main data

data <- dropDays_parent(data=data)
data <- addDemo(data=data)

short_dl <- makeShortData(data=data, thesecities=thesecities)

models_dl <- makeModels_crossValidate(short_dl, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```

## No Bots
```{r}
data <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_noBot.csv', stringsAsFactors=FALSE)

names(data) <- gsub('binary_', '', names(data))

data$country <- data$place.country_code  # Need to add country like I did earlier for main data
data <- dropDays_parent(data=data)
data <- addDemo(data=data)

short <- makeShortData(data=data, thesecities=thesecities)

models_bots <- makeModels_crossValidate(short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```

## No Duplicates
```{r}
data <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_noDuplicate.csv', stringsAsFactors=FALSE)

names(data) <- gsub('binary_', '', names(data))

data$country <- data$place.country_code  # Need to add country like I did earlier for main data

data <- dropDays_parent(data=data)
data <- addDemo(data=data)

```


### Identify Cities
Use this object to filter countries wanted.  This is based on observing how many tweets per day they have from `ProcesssFromPythonAggregation` script.


Which cities to use?  5,109 have a day with a tweet.
```{r}
citydays <- data %>% group_by(city_use, country) %>% summarize(daysWithTweets=length(tweets[tweets>0]))

countrydays <- data %>% group_by(country) %>% summarize(days=length(unique(day)))

# Add number of days of each protest period
citydays <- merge(citydays, countrydays, by.x=c('country'), by.y=c('country'))

# Get rate
citydays$rate <- citydays$daysWithTweets/citydays$days
citydays <- citydays[order(citydays$rate, decreasing=TRUE),]

# Subset
#smaller <- citydays[citydays$country %in% thesecountries,]

percThreshold <- 1/7
thesecities_nodup <- citydays$city_use[citydays$rate>=percThreshold]
```

```{r}
short <- makeShortData(data=data, thesecities=thesecities_nodup)

models_dedup <- makeModels_crossValidate(short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')

models_dedup_weighted <- makeModels_crossValidate_weighted(short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```

## Popularity
```{r}
data <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_2575Popularity.csv', stringsAsFactors=FALSE)
names(data) <- gsub('binary_', '', names(data))

data$country <- data$place.country_code  # Need to add country like I did earlier for main data

data <- dropDays_parent(data=data)
data <- addDemo(data=data)

short <- makeShortData(data=data, thesecities=thesecities)

models_normalUser <- makeModels_crossValidate(short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```

## Users Twitter Images Shared
This section is where the dependent variable is the number of unique users who posted protest images per unit of aggregation.  The name of the variable is what was called `unique_ID` in the Python aggregation. 

### All-countries
Here are the models for all countries, count Twitter user as DV.  
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
short <- makeShortData(data=country, thesecities=thesecities)

models_users <- makeModels_crossValidate(short, method='lm', DV='Protesters', DV_lag='Protesters_lag')
```

## Logarithm Users Twitter Images Shared
This section is where the dependent variable is the logarithm of the number of accounts sharing a protest photo in a day.  

Make log variable of protesters.
```{r}
country$Protesters_log <- log(country$Protesters+1, 10)
country$Protesters_lag_log <- log(country$Protesters_lag+1, 10)
```

### All-countries

```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
short <- makeShortData(data=country, thesecities=thesecities)

models_usersLog <- makeModels_crossValidate(short, method='lm', DV='Protesters_log', DV_lag='Protesters_lag_log')

```

## Make Table 6
``models`` needs to be run first.  It is the main model
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE, include=TRUE}
stargazer(models_dl$m_combined$finalModel, models_bots$m_combined$finalModel, models_dedup$m_combined$finalModel, models_normalUser$m_combined$finalModel, models_usersLog$m_combined$finalModel, type='latex', out='./Tables/table6.tex', column.labels=c('Dominant Language', 'No Bots', 'Deduplicated Images', 'IQR Users', 'DV: Log(Number of Users)'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(models_dl$m_combined_clusterSE, models_bots$m_combined_clusterSE, models_dedup$m_combined_clusterSE, models_normalUser$m_combined_clusterSE, models_usersLog$m_combined_clusterSE))
```

# Table 7: Robustness, Time Effects
## DV Lags
Below is with 15 lags, which the PACF plot says to do.  
```{r}
short <- makeShortData(data=country, thesecities=thesecities)

# need dplyr:: so R uses that package, otherwise is clash with plm's namesapce
temp <- data.frame(short %>% group_by(city_use) %>% mutate(totalFaces_lag_log2=dplyr::lag(totalFaces_log, 2),totalFaces_lag_log3=dplyr::lag(totalFaces_log, 3),totalFaces_lag_log4=dplyr::lag(totalFaces_log, 4),totalFaces_lag_log5=dplyr::lag(totalFaces_log, 5),totalFaces_lag_log6=dplyr::lag(totalFaces_log, 6),totalFaces_lag_log7=dplyr::lag(totalFaces_log, 7),totalFaces_lag_log8=dplyr::lag(totalFaces_log, 8),totalFaces_lag_log9=dplyr::lag(totalFaces_log, 9),totalFaces_lag_log10=dplyr::lag(totalFaces_log, 10),totalFaces_lag_log11=dplyr::lag(totalFaces_log, 11),totalFaces_lag_log12=dplyr::lag(totalFaces_log, 12),totalFaces_lag_log13=dplyr::lag(totalFaces_log, 13),totalFaces_lag_log14=dplyr::lag(totalFaces_log, 14),totalFaces_lag_log15=dplyr::lag(totalFaces_log, 15)))


# Below is with the 15 lags
m_lag15 <-  lm(totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log + totalFaces_lag_log2 +totalFaces_lag_log3 +totalFaces_lag_log4 +totalFaces_lag_log5 +totalFaces_lag_log6 +totalFaces_lag_log7 +totalFaces_lag_log8 +totalFaces_lag_log9 +totalFaces_lag_log10 +totalFaces_lag_log11 +totalFaces_lag_log12 +totalFaces_lag_log13 +totalFaces_lag_log14 +totalFaces_lag_log15 + factor(city_use), data=temp)

m_lag15_clusterSE <- coeftest(m_lag15, vcov=vcovHC(m_lag15, cluster='group'))[,2]
```

## Weekend, Weekday FE
Make sure to have run original model.
```{r}
# Below is with the weekend fixed effect 
m_weekend <-  lm(totalFaces_log ~ factor(weekend) + protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag +totalFaces_lag_log+ factor(city_use), data=short)

m_weekend_clusterSE <- coeftest(m_weekend, vcov=vcovHC(m_weekend, cluster='group'))[,2]
#m_new1_clusterSE <- summary(m_new1, cluster=c('city_use'))$coefficients[,2]


# Below is with the weekday  fixed effect 
m_weekday <-  lm(totalFaces_log ~ factor(dayOfWeek) + protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag +totalFaces_lag_log+ factor(city_use), data=short)

m_weekday_clusterSE <- coeftest(m_weekday, vcov=vcovHC(m_weekday, cluster='group'))[,2]
#m_new1b_clusterSE <- summary(m_new1b, cluster=c('city_use'))$coefficients[,2]
```

## Duration Controls
Control for how long a protest has gone on and how long state repression lasts for.
```{r}
temp <- makeShortData(data=country, thesecities=thesecities)


# Below is with the protest days control
m_protestDays <-  lm(totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log + protest_duration + factor(city_use), data=temp)

m_protestDays_clusterSE <- coeftest(m_protestDays, vcov=vcovHC(m_protestDays, cluster='group'))[,2]

# Below is with state duration control
m_svDuration <-  lm(totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log + state_violence_duration + factor(city_use), data=temp)

m_svDuration_clusterSE <- coeftest(m_svDuration, vcov=vcovHC(m_svDuration, cluster='group'))[,2]

# Below is with both durations
m_durations <-  lm(totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_lag_log + protest_duration + state_violence_duration + factor(city_use), data=temp)

m_durations_clusterSE <- coeftest(m_durations, vcov=vcovHC(m_durations, cluster='group'))[,2]
```

## Make Table 7
```{r}
stargazer(m_lag15, m_weekend, m_weekday, m_durations, type='latex', out='./Tables/table7.tex', column.labels=c('15 Lags', 'Weekend FE', 'Weekday FE', 'Duration Controls'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(m_lag15_clusterSE, m_weekend_clusterSE, m_weekday_clusterSE, m_durations_clusterSE))
```



# Table A6: Robustness, Higher Aggregation
## Country
```{r}
country <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays.csv', stringsAsFactors=FALSE)
country$country <- country$place.country_code

country <- addDemo(data=country)
names(country) <- gsub('binary_', '', names(country))

these <- c('ES', 'HK', 'PK', 'KR', 'VE1', 'VE2')
country <- country[country$country %in% these,]
country <- dropDays_parent(data=country)

country$city_use <- country$country  # Will let me use the existing function made for cities


models_c <- makeModels_crossValidate(data=country, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```

## State
```{r}
country <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays.csv', stringsAsFactors=FALSE)
#country <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_noDuplicate.csv', stringsAsFactors=FALSE)
country$country <- country$place.country_code

country <- addDemo(data=country)
names(country) <- gsub('binary_', '', names(country))

these <- c('ES', 'HK', 'PK', 'KR', 'VE1', 'VE2')
country <- country[country$country %in% these,]
country <- dropDays_parent(data=country)

country$city_use <- country$rg.state  # Will let me use the existing function made for cities

# Drop states not frequent enough
# I keep the language of city because it is pasted from higher in the script, but notice above city_use == rg.state
citydays <- country %>% group_by(country, city_use) %>% summarize(daysWithTweets=length(tweets[tweets>0]))

countrydays <- country %>% group_by(country, city_use) %>% summarize(days=length(unique(day)))


# Add number of days of each protest period
citydays <- merge(citydays, countrydays, by.x=c('city_use'), by.y=c('city_use'))

# Get rate
citydays$rate <- citydays$daysWithTweets/citydays$days
citydays <- citydays[order(citydays$rate, decreasing=TRUE),]

# Subset
percThreshold <- 1/7  # For one day per week
thesecities <- citydays$city_use[citydays$rate>=percThreshold]

country <- country[country$city_use %in% thesecities,]

# Some states still only there once, get rid of
hm <- data.frame(table(country$city_use))
hm <- hm[hm$Freq >= 10,]
hm <- as.character(hm$Var1)

country <- country[country$city_use %in% hm,]
  
models_s <- makeModels_crossValidate(data=country, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```

## Make Stargazer
```{r}
stargazer(models_s$m_combined$finalModel, models_c$m_combined$finalModel, type='latex', out='./Tables/robustness_stateAndCountryAgg.tex', column.labels=c('Unit: State', 'Unit: Country'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(models_s$m_combined_clusterSE, models_c$m_combined_clusterSE))
```


# Table A7: Robustness, Count Models and Zero Days
## Only days with protest
The numbering of the models is off because I pasted it from the previous section, where it was the fourth.
```{r}
short <- makeShortData(data=country, thesecities=thesecities)

temp <- short[short$tweets>0,]

m_new4 <-  lm(totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag +totalFaces_lag_log+ factor(city_use), data=temp)

m_new4_clusterSE <- coeftest(m_new4, vcov=vcovHC(m_new4, cluster='group'))[,2]
```

## Count Models
```{r}
short <- makeShortData(data=country, thesecities=thesecities)

# Below is for original model
models <- makeModels_crossValidate(short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')

# This one is needed for zero inflated
combined <- totalFaces_Sum ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag + tweets_lag + totalFaces_Sum_lag + factor(city_use)

# NB: Have to drop city FE to get convergence, no cluster SE either.
combined_zinb <- totalFaces_Sum ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag + totalFaces_Sum_lag + factor(country)

poisson <- poissonmfx(formula=combined, data=short, robust=TRUE, clustervar1 = 'city_use')
negbinom <- negbinmfx(formula=combined, data=short, robust=TRUE, clustervar1='city_use')

# Can't get below to converge
zeronegbinom2 <- zeroinfl(formula=combined_zinb, data=short, dist = 'negbin', maxit = 1000)

```

## Make Table
```{r}
# Need to introduce cluster SE for m_new4 manually when put into paper
stargazer(models$m_combined$finalModel, m_new4, poisson$fit, negbinom$fit, zeronegbinom2, t.auto = TRUE, p.auto = TRUE, omit = c('factor'), omit.labels = c('City FE'), notes.align = 'l',table.placement = 'htpb!',ci.level = .95, out = './Tables/robustness_table_a7.tex', no.space = TRUE, dep.var.caption = 'Protesters per City-Day',style="apsr",initial.zero=TRUE,model.numbers=TRUE, multicolumn=TRUE, omit.stat = c('aic','theta'), column.labels=c('Original', 'No Zero Days', 'Poisson', 'Negative Binomial', 'Zero-inflated Negative Binomial'), digits=4)
```


# Table A8: Robustness, Alternative DV
## Raw DV
### All-countries
Here are the models for all countries, not logging the DV.
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
short <- makeShortData(data=country, thesecities=thesecities)

# Two, so can have full model
models2 <- makeModels_crossValidate(short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
models <- makeModels_crossValidate(short, method='lm', DV='totalFaces_Sum', DV_lag='totalFaces_Sum_lag')
```


## Users Twitter Images Shared
This section is where the dependent variable is the number of unique users who posted protest images per unit of aggregation.  The name of the variable is what was called `unique_ID` in the Python aggregation. 

### All-countries
Here are the models for all countries, count Twitter user as DV.  
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
short <- makeShortData(data=country, thesecities=thesecities)

models3 <- makeModels_crossValidate(short, method='lm', DV='Protesters', DV_lag='Protesters_lag')
```


## Logarithm Users Twitter Images Shared
This section is where the dependent variable is the logarithm of the number of accounts sharing a protest photo in a day.  

Make log variable of protesters.
```{r}
country$Protesters_log <- log(country$Protesters+1, 10)
country$Protesters_lag_log <- log(country$Protesters_lag+1, 10)
```

### All-countries
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
short <- makeShortData(data=country, thesecities=thesecities)

models4 <- makeModels_crossValidate(short, method='lm', DV='Protesters_log', DV_lag='Protesters_lag_log')

```


## Coarsen DV
### All countries
```{r}
short <- makeShortData(data=country, thesecities=thesecities)

# Make ordinal, log
short$coarse_Protesters_log <- as.numeric(cut(short$totalFaces_Sum, breaks=c(0, 1, 10, 100, 1000), labels=c('0', '1', '2', '3'), right=FALSE))
short$coarse_Protesters_lag_log <- as.numeric(cut(short$totalFaces_Sum_lag, breaks=c(0, 1, 10, 100, 1000), labels=c('0', '1', '2', '3'), right=FALSE))

# Round to nearest ten
short$coarse_Protesters_ten <- round(short$totalFaces_Sum, digits=-1)
short$coarse_Protesters_lag_ten <- round(short$totalFaces_Sum_lag, digits=-1)

models5 <- makeModels_crossValidate(short, method='lm', DV='coarse_Protesters_log', DV_lag='coarse_Protesters_lag_log')
models6 <- makeModels_crossValidate(short, method='lm', DV='coarse_Protesters_ten', DV_lag='coarse_Protesters_lag_ten')
```

## Make One Latex Table
Below is table with coarsened DV added.
```{r}
stargazer(models2$m_combined$finalModel, models$m_combined$finalModel, models3$m_combined$finalModel,  models4$m_combined$finalModel, models5$m_combined$finalModel, models6$m_combined$finalModel, type='latex', out='./Tables/appendix_table_a8.tex', column.labels=c('Original', 'No Log', 'Number Users', 'Log(Number Users)', 'Ordinal (Log)', 'Ordinal (Tens)'),  notes.align='l', digits=4, initial.zero=FALSE, omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(models2$m_combined_clusterSE, models$m_combined_clusterSE, models3$m_combined_clusterSE, models4$m_combined_clusterSE, models5$m_combined_clusterSE, models6$m_combined_clusterSE))
```


# Table A9: Robustness, Weighted by Tweets
```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE, include=TRUE}
short <- makeShortData(data=country, thesecities=thesecities)

models_orig <- makeModels_crossValidate(data=short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')

# By tweets
models <- makeModels_crossValidate_weighted(data=short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log', weight=short$tweets_lag)

# By inverse number of tweets
models2 <- makeModels_crossValidate_weighted(data=short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log', weight=(1/(short$tweets_lag+1)))

stargazer(models_orig$m_combined$finalModel, models$m_combined$finalModel, models2$m_combined$finalModel, type='latex', out='./Tables/appendix_table_a9.tex', column.labels=c('Original', '# Tweets', '1/(# Tweets)', 'Dedup.', 'Dedup., New Cities'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(models_orig$m_combined_clusterSE, models$m_combined_clusterSE, models2$m_combined_clusterSE))
```


# Table A10: Robustness, Mobile and Bot Tweets
These models are those in Table A10.  I manually created that table, so the Stargazer output below has to be manually combined.
## Time Window
```{r}
data <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_timeNarrow.csv', stringsAsFactors=FALSE)
names(data) <- gsub('binary_', '', names(data))

data$country <- data$place.country_code  # Need to add country like I did earlier for main data

data <- dropDays_parent(data=data)
data <- addDemo(data=data)

short <- makeShortData(data=data, thesecities=thesecities)

models4 <- makeModels_crossValidate(short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```


```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE, include=TRUE}
stargazer(models4$m_violence$finalModel, models4$m_demographics$finalModel, models4$m_combined$finalModel, type='latex', out='./Tables/robustness_timeNarrow.tex', column.labels=c('Violence', 'Demographics', 'Combined'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(models4$m_violence_clusterSE,models4$m_demographics_clusterSE, models4$m_combined_clusterSE))
```

## Only Mobile
```{r}
data <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_mobiletweets.csv', stringsAsFactors=FALSE)
names(data) <- gsub('binary_', '', names(data))

data$country <- data$place.country_code  # Need to add country like I did earlier for main data

data <- dropDays_parent(data=data)
data <- addDemo(data=data)

short <- makeShortData(data=data, thesecities=thesecities)

models2 <- makeModels_crossValidate(short, method='lm', DV='totalFaces_log', DV_lag='totalFaces_lag_log')
```


```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE, include=TRUE}
stargazer(models2$m_violence$finalModel, models2$m_demographics$finalModel, models2$m_combined$finalModel, type='latex', out='./Tables/robustness_mobileTweets.tex', column.labels=c('Violence', 'Demographics', 'Combined'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(models2$m_violence_clusterSE,models2$m_demographics_clusterSE, models2$m_combined_clusterSE))
```

# Table A11: Robustness, Protest Tweets
Load, merge the new data.
```{r}
protest_tweets <- read.csv('./Data/02_processedData/d_robustness_protestTweets_allCountriesMerged_cityday.csv', stringsAsFactors=FALSE)

robustness_tweets <- merge(short, protest_tweets, on.x=c('place.country_code', 'rg.state', 'city_use', 'day'), on.y=c('place.country_code', 'rg.state', 'city_use', 'day'), all.x=TRUE, all.y=FALSE)
robustness_tweets$tweet_protest_lag[is.na(robustness_tweets$tweet_protest_lag)] <- 0  # NA is actually days w/ 0 protest tweets

m_rt <-  lm(totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag + tweets_lag + tweet_protest_lag + totalFaces_lag_log+ factor(city_use), data=robustness_tweets)

m_rt_clusterSE <- coeftest(m_rt, vcov=vcovHC(m_rt, cluster='group'))[,2]

m_full <-  lm(totalFaces_log ~ protest_result.protester_violence_lag + protest_result.state_violence_lag + I(protest_result.state_violence_lag*protest_result.state_violence_lag) + police_lag + fire_lag +  facesMale_Perc_lag +  youngAdultPerc_lag +  tweets_lag +totalFaces_lag_log+ factor(city_use), data=short)

m_full_clusterSE <- coeftest(m_full, vcov=vcovHC(m_full, cluster='group'))[,2]

stargazer(m_full, m_rt, type='latex', out='./Tables/appendix_table_a11.tex', column.labels=c('Original Model', 'Protest Text Tweets'),  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: Sum of Faces', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr', se = list(m_full_clusterSE, m_rt_clusterSE))

```



# Table A4: Onset of State Repression
```{r}
cities <- unique(short$city_use)
short_onset <- NULL
for(i in 1:length(cities)){
  temp <- short[short$city_use == cities[i],]
  index <- head(which(temp$state_violence>0), 1)
  temp$repression_onset <- 0
  if(length(index) > 0){  # If no repression, index is integer(0), which is no length
    temp$repression_onset[index] <- 1
    temp2 <- temp[1:index,]
  }
  if(length(index) == 0){
    temp2 <- temp
  }
  short_onset <- rbind(short_onset, temp2)
  
  }

m_onset <- glm(repression_onset ~ totalFaces_log + totalFaces_lag_log + factor(city_use), data=short_onset, family='binomial')
m_onset_ols <- lm(repression_onset ~ totalFaces_log + totalFaces_lag_log + factor(city_use), data=short_onset)

stargazer(m_onset, m_onset_ols, type='latex', out='./Tables/appendix_table_a4.tex',  notes.align='l', digits=4, initial.zero=FALSE, dep.var.caption='DV: First Day of Repression', omit="factor", omit.stat=c('f', 'res.dev'), style='apsr')
```

# Figure A11: Faces per Photo
Do large protests have more faces per photo?
```{r}
temp <- short
temp$country <- gsub('VE2', 'VE', temp$country)
temp$country <- gsub('VE1', 'VE', temp$country)

temp$facesPerTweet <- temp$n_face/temp$tweets

ggplot(temp, aes(x=n_face, y=facesPerTweet)) + geom_point(alpha=.5) + theme_classic() + geom_smooth(method='loess', span=10, se=FALSE) + xlab('Protest Size') + ylab('Faces per Protest Photo') + geom_abline(intercept=0, slope=1, lty='dotted')
ggsave('./Figures/robustness_figure_A11.jpg')
```


